{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 4 - Introduction to Autoregressive and Automated Methods for Time Series Forecasting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime as dt\n",
    "import os\n",
    "import shutil\n",
    "import warnings\n",
    "from collections import UserDict\n",
    "from glob import glob\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from common.utils import load_data, mape\n",
    "from IPython.display import Image\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "pd.options.display.float_format = \"{:,.2f}\".format\n",
    "np.set_printoptions(precision=2)\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dir = \"./data\"\n",
    "ts_data_load = load_data(data_dir)[[\"load\"]]\n",
    "ts_data_load.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lag plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.plotting import lag_plot\n",
    "\n",
    "plt.figure()\n",
    "\n",
    "lag_plot(ts_data_load)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Autocorrelation plot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Autocorrelation Plot Results from ts_data_load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.plotting import autocorrelation_plot\n",
    "\n",
    "plt.figure()\n",
    "\n",
    "autocorrelation_plot(ts_data_load)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Autocorrelation Plot Results from ts_data_load_subset (First week of August 2014)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_data_load = load_data(\"data/\")[[\"load\"]]\n",
    "ts_data_load.head()\n",
    "\n",
    "ts_data_load_subset = ts_data_load[\"2014-08-01\":\"2014-08-07\"]\n",
    "\n",
    "from pandas.plotting import autocorrelation_plot\n",
    "\n",
    "plt.figure()\n",
    "\n",
    "autocorrelation_plot(ts_data_load_subset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Autocorrelation function (acf) plot on ts_data_load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot\n",
    "from statsmodels.graphics.tsaplots import plot_acf\n",
    "\n",
    "plot_acf(ts_data_load)\n",
    "pyplot.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Autocorrelation function (acf) plot on ts_data_load subset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot\n",
    "from statsmodels.graphics.tsaplots import plot_acf\n",
    "\n",
    "plot_acf(ts_data_load_subset)\n",
    "pyplot.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Partial correlation function (pacf) plot on ts_data_load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot\n",
    "from statsmodels.graphics.tsaplots import plot_pacf\n",
    "\n",
    "plot_pacf(ts_data_load, lags=20)\n",
    "pyplot.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Partial correlation function (pacf) plot on ts_data_load subset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot\n",
    "from statsmodels.graphics.tsaplots import plot_pacf\n",
    "\n",
    "plot_pacf(ts_data_load_subset, lags=30)\n",
    "pyplot.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Autoregressive method class in Statsmodels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import pandas_datareader as pdr\n",
    "import seaborn as sns\n",
    "from statsmodels.tsa.api import acf, graphics, pacf\n",
    "from statsmodels.tsa.ar_model import AutoReg, ar_select_order"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = AutoReg(ts_data_load['load'], 1)\n",
    "results = model.fit()\n",
    "print(results.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Note: AutoReg supports describing the same covariance estimators as OLS. Below, we use cov_type=\"HC0\", which is White’s covariance estimator. While the parameter estimates are the same, all of the quantities that depend on the standard error change."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = model.fit(cov_type=\"HC0\")\n",
    "print(res.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set_style(\"darkgrid\")\n",
    "pd.plotting.register_matplotlib_converters()\n",
    "sns.mpl.rc(\"figure\", figsize=(16, 6))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = res.plot_predict(720, 840)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(16, 9))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = res.plot_diagnostics(fig=fig, lags=20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare the ts_data_load dataset for forecasting task with AutoReg() function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_start_dt = \"2014-11-01 00:00:00\"\n",
    "test_start_dt = \"2014-12-30 00:00:00\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = ts_data_load.copy()[\n",
    "    (ts_data_load.index >= train_start_dt) & (ts_data_load.index < test_start_dt)\n",
    "][[\"load\"]]\n",
    "test = ts_data_load.copy()[ts_data_load.index >= test_start_dt][[\"load\"]]\n",
    "\n",
    "print(\"Training data shape: \", train.shape)\n",
    "print(\"Test data shape: \", test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler = MinMaxScaler()\n",
    "train[\"load\"] = scaler.fit_transform(train)\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[\"load\"] = scaler.transform(test)\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "HORIZON = 3\n",
    "print(\"Forecasting horizon:\", HORIZON, \"hours\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_shifted = test.copy()\n",
    "\n",
    "for t in range(1, HORIZON):\n",
    "    test_shifted[\"load+\" + str(t)] = test_shifted[\"load\"].shift(-t, freq=\"H\")\n",
    "\n",
    "test_shifted = test_shifted.dropna(how=\"any\")\n",
    "test_shifted.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "training_window = 720\n",
    "\n",
    "train_ts = train[\"load\"]\n",
    "test_ts = test_shifted\n",
    "\n",
    "history = [x for x in train_ts]\n",
    "history = history[(-training_window):]\n",
    "\n",
    "predictions = list()\n",
    "\n",
    "for t in range(test_ts.shape[0]):\n",
    "    model = AutoReg(history, 1)\n",
    "    model_fit = model.fit()\n",
    "    yhat = model_fit.forecast(steps=HORIZON)\n",
    "    predictions.append(yhat)\n",
    "    obs = list(test_ts.iloc[t])\n",
    "    history.append(obs[0])\n",
    "    history.pop(0)\n",
    "    print(test_ts.index[t])\n",
    "    print(t + 1, \": predicted =\", yhat, \"expected =\", obs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Autoregressive Integrated Moving Average method in Statsmodels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime as dt\n",
    "import math\n",
    "import os\n",
    "import warnings\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from common.utils import load_data, mape\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from statsmodels.tsa.statespace.sarimax import SARIMAX\n",
    "\n",
    "%matplotlib inline\n",
    "pd.options.display.float_format = \"{:,.2f}\".format\n",
    "np.set_printoptions(precision=2)\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dir = \"./data\"\n",
    "ts_data_load = load_data(data_dir)[[\"load\"]]\n",
    "ts_data_load.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_start_dt = \"2014-11-01 00:00:00\"\n",
    "test_start_dt = \"2014-12-30 00:00:00\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = ts_data_load.copy()[\n",
    "    (ts_data_load.index >= train_start_dt) & (ts_data_load.index < test_start_dt)\n",
    "][[\"load\"]]\n",
    "test = ts_data_load.copy()[ts_data_load.index >= test_start_dt][[\"load\"]]\n",
    "\n",
    "print(\"Training data shape: \", train.shape)\n",
    "print(\"Test data shape: \", test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler = MinMaxScaler()\n",
    "train[\"load\"] = scaler.fit_transform(train)\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[\"load\"] = scaler.transform(test)\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "HORIZON = 3\n",
    "print(\"Forecasting horizon:\", HORIZON, \"hours\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "order = (4, 1, 0)\n",
    "seasonal_order = (1, 1, 0, 24)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SARIMAX(endog=train, order=order, seasonal_order=seasonal_order)\n",
    "results = model.fit()\n",
    "\n",
    "print(results.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_shifted = test.copy()\n",
    "\n",
    "for t in range(1, HORIZON):\n",
    "    test_shifted[\"load+\" + str(t)] = test_shifted[\"load\"].shift(-t, freq=\"H\")\n",
    "\n",
    "test_shifted = test_shifted.dropna(how=\"any\")\n",
    "test_shifted.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "training_window = 720\n",
    "\n",
    "train_ts = train[\"load\"]\n",
    "test_ts = test_shifted\n",
    "\n",
    "history = [x for x in train_ts]\n",
    "history = history[(-training_window):]\n",
    "\n",
    "predictions = list()\n",
    "\n",
    "order = (2, 1, 0)\n",
    "seasonal_order = (1, 1, 0, 24)\n",
    "\n",
    "for t in range(test_ts.shape[0]):\n",
    "    model = SARIMAX(endog=history, order=order, seasonal_order=seasonal_order)\n",
    "    model_fit = model.fit()\n",
    "    yhat = model_fit.forecast(steps=HORIZON)\n",
    "    predictions.append(yhat)\n",
    "    obs = list(test_ts.iloc[t])\n",
    "    history.append(obs[0])\n",
    "    history.pop(0)\n",
    "    print(test_ts.index[t])\n",
    "    print(t + 1, \": predicted =\", yhat, \"expected =\", obs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "eval_df = pd.DataFrame(\n",
    "    predictions, columns=[\"t+\" + str(t) for t in range(1, HORIZON + 1)]\n",
    ")\n",
    "eval_df[\"timestamp\"] = test.index[0 : len(test.index) - HORIZON + 1]\n",
    "eval_df = pd.melt(eval_df, id_vars=\"timestamp\", value_name=\"prediction\", var_name=\"h\")\n",
    "eval_df[\"actual\"] = np.array(np.transpose(test_ts)).ravel()\n",
    "eval_df[[\"prediction\", \"actual\"]] = scaler.inverse_transform(\n",
    "    eval_df[[\"prediction\", \"actual\"]]\n",
    ")\n",
    "eval_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if HORIZON > 1:\n",
    "    eval_df[\"APE\"] = (eval_df[\"prediction\"] - eval_df[\"actual\"]).abs() / eval_df[\n",
    "        \"actual\"\n",
    "    ]\n",
    "    print(eval_df.groupby(\"h\")[\"APE\"].mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\n",
    "    \"One-step forecast MAPE: \",\n",
    "    (\n",
    "        mape(\n",
    "            eval_df[eval_df[\"h\"] == \"t+1\"][\"prediction\"],\n",
    "            eval_df[eval_df[\"h\"] == \"t+1\"][\"actual\"],\n",
    "        )\n",
    "    )\n",
    "    * 100,\n",
    "    \"%\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\n",
    "    \"Multi-step forecast MAPE: \",\n",
    "    mape(eval_df[\"prediction\"], eval_df[\"actual\"]) * 100,\n",
    "    \"%\",\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "jupytext": {
   "encoding": "# -*- coding: utf-8 -*-",
   "formats": "ipynb,py:light"
  },
  "kernel_info": {
   "name": "python3"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  },
  "nteract": {
   "version": "nteract-front-end@1.0.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
